How many of the domains identified using our approach are in the ELM database (as compared to the population of all domains we tested)?
Calculate empirical pvalues using either frequency of a domain among interacting partners of viral protein or Fisher test p-value
# frequency function: set up standard parameters
permuteFrequency = function(data, select_nodes = NULL, also_permuteYZ = F){
res = permutationPval(interactions2permute = IDs_interactor_viral ~ IDs_interactor_human, # first set of interacting pairs (XY) that are to be permuted
associations2test = IDs_interactor_viral ~ IDs_domain_human, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
node_attr = list(IDs_interactor_viral ~ IDs_interactor_viral_degree,
IDs_domain_human ~ domain_count,
IDs_interactor_viral + IDs_domain_human ~ domain_frequency_per_IDs_interactor_viral),
data = data,
statistic = IDs_interactor_viral + IDs_domain_human ~ .N / IDs_interactor_viral_degree,
select_nodes = select_nodes,
N = N_permut,
cores = cores_to_use, seed = 2, also_permuteYZ = also_permuteYZ)
return(res)
}
data = fread("./processed_data_files/viral_human_net_w_domains", sep = "\t", stringsAsFactors = F)
# frequency: all proteins and domains - # permute IDs_interactor_viral ~ IDs_interactor_human
time = proc.time()
res = permuteFrequency(data, select_nodes = NULL)
proc.time() - time
## user system elapsed
## 1.006 0.054 12.122
plot(res, main = "frequency: all proteins and domains")

# permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human
time = proc.time()
res_both = permuteFrequency(data, select_nodes = NULL, also_permuteYZ = T)
proc.time() - time
## user system elapsed
## 1.243 0.035 12.511
plot(res_both, main = "frequency: all proteins and domains")

# frequency: no low background count domains
res_low_back = permuteFrequency(data, select_nodes = IDs_domain_human ~ domain_count >= 3)
res_low_back_alt = permuteFrequency(data[domain_count >= 3,])
all.equal(res_low_back$data_with_pval[complete.cases(res_low_back$data_with_pval),p.value], res_low_back_alt$data_with_pval[complete.cases(res_low_back_alt$data_with_pval),p.value])
## [1] "Mean relative difference: 0.1645512"
plot(res_low_back, main = "frequency: no low background count domains (>= 3)")

# frequency: not considering (fixing interactions, degree of every node in the network stays the same, but only high degree proteins are taken into account, equivalent to permuting only interactions of protein with the degree higher than 1) viral proteins with the degree of 1 - removing viral proteins with the degree of 1
res_low_deg = permuteFrequency(data, select_nodes = IDs_interactor_viral ~ IDs_interactor_viral_degree >= 2)
res_low_deg_alt = permuteFrequency(data[IDs_interactor_viral_degree >= 2,])
all.equal(res_low_deg$data_with_pval[complete.cases(res_low_deg$data_with_pval),p.value], res_low_deg_alt$data_with_pval[complete.cases(res_low_deg_alt$data_with_pval),p.value])
## [1] TRUE
plot(res_low_deg, main = "frequency: not considering viral proteins with the degree of 1")

# frequency: BOTH no low background count domains AND removing viral proteins with the degree of 1
res_low_deg_back = permuteFrequency(data, select_nodes = list(IDs_domain_human ~ domain_count >= 3,
IDs_interactor_viral ~ IDs_interactor_viral_degree >= 2))
res_low_deg_back_alt = permuteFrequency(data[domain_count >= 3 & IDs_interactor_viral_degree >= 2,])
all.equal(res_low_deg_back$data_with_pval[complete.cases(res_low_deg_back$data_with_pval),p.value], res_low_deg_back_alt$data_with_pval[complete.cases(res_low_deg_back_alt$data_with_pval),p.value])
## [1] "Mean relative difference: 0.1761649"
plot(res_low_deg_back, main = "frequency: no low background count domains (>= 3)\nAND not considering viral proteins with the degree of 1")

# Fisher test: set up standard parameters
permuteFisherTest = function(data, select_nodes = NULL, also_permuteYZ = F){
resFISHER = permutationPval(interactions2permute = IDs_interactor_viral ~ IDs_interactor_human, # first set of interacting pairs (XY) that are to be permuted
associations2test = IDs_interactor_viral ~ IDs_domain_human, # set of interacting pairs to be tested (XZ), YZ interactions are assumed
node_attr = list(IDs_interactor_viral ~ IDs_interactor_viral_degree, # attribute of X
IDs_domain_human ~ domain_count + N_prot_w_interactors, # attributes of Z
IDs_interactor_viral + IDs_domain_human ~ domain_count_per_IDs_interactor_viral), # attribute of both X and Z
data = data, # data.table containing data
statistic = IDs_interactor_viral + IDs_domain_human ~ fisher.test(
matrix(c(unique(domain_count_per_IDs_interactor_viral),
unique(IDs_interactor_viral_degree) - unique(domain_count_per_IDs_interactor_viral),
unique(domain_count),
unique(N_prot_w_interactors) - unique(domain_count)),
2,2),
alternative = "greater", conf.int = F)$p.value, # formula to calculate statisic by evaluating right-hand-side expression for each X and Z pair, right-hand-side expression is what is normally put in j in data.table DT[i, j, by], left-hand-side expression contains column names of X and Z which are used in by in data.table
select_nodes = select_nodes, # select a subset of the data, only nodes
N = N_permut, # number of permutations
cores = cores_to_use, seed = 1, also_permuteYZ = also_permuteYZ)
# permutationPval returns the number of cases when permuted statitic is higher than the observed statistic (right tail of the distribution), in this case we are interested in the reverse - the lower tail, when p-values from permuted distribution that are lower than the observed p-value
resFISHER$data_with_pval[, p.value := 1 - p.value]
return(resFISHER)
}
# contingency matrix:
matrix(c("unique(domain_count_per_IDs_interactor_viral)",
"unique(IDs_interactor_viral_degree) - unique(domain_count_per_IDs_interactor_viral)",
"unique(domain_count)",
"unique(N_prot_w_interactors) - unique(domain_count)"),2,2)
## [,1]
## [1,] "unique(domain_count_per_IDs_interactor_viral)"
## [2,] "unique(IDs_interactor_viral_degree) - unique(domain_count_per_IDs_interactor_viral)"
## [,2]
## [1,] "unique(domain_count)"
## [2,] "unique(N_prot_w_interactors) - unique(domain_count)"
# permute IDs_interactor_viral ~ IDs_interactor_human
time = proc.time()
resFISHER = permuteFisherTest(data, select_nodes = NULL)
proc.time() - time
## user system elapsed
## 3.956 0.056 17.936
plot(resFISHER, main = "Fisher test P-value: all proteins and domains")
# permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human
time = proc.time()
resFISHER_both = permuteFisherTest(data, select_nodes = NULL, also_permuteYZ = T)
proc.time() - time
## user system elapsed
## 3.904 0.048 17.558
plot(resFISHER, main = "Fisher test P-value: all proteins and domains")

Viral protein degree and the background domain count of top-scoring proteins
# function to accomodate ggplot2::geom_bin2d in GGally::ggpairs, taken from http://ggobi.github.io/ggally/#custom_functions
d2_bin_log10 <- function(data, mapping, ..., low = "#132B43", high = "#56B1F7") {
ggplot(data = data, mapping = mapping) +
geom_bin2d(...) +
scale_fill_gradient(low = low, high = high) +
scale_y_log10() + scale_x_log10()
}
log10_density = function(data, mapping, ...){
ggplot(data = data, mapping = mapping) +
geom_density(...) +
scale_x_log10()
}
PermutResult2D = function(res, N){
res_temp = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human,
domain_count,
IDs_interactor_viral_degree,
domain_count_per_IDs_interactor_viral,
p.value)])
GGally::ggpairs(res_temp[order(p.value, decreasing = F)[1:N],.(domain_count,
IDs_interactor_viral_degree,
domain_count_per_IDs_interactor_viral,
p.value)],
lower = list(continuous = d2_bin_log10)#,
#diag = list(continuous = geom_density)
) +
theme_light() +
theme(strip.text.y = element_text(angle = 0, size = 10),
strip.text.x = element_text(angle = 90, size = 10))
}
# frequency of a domain
PermutResult2D(res = res, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# frequency of a domain
PermutResult2D(res = res_both, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# frequency of a domain: no low background count domains
PermutResult2D(res = res_low_back, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n no low background count domains")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# frequency of a domain: no viral proteins with the degree of 1
PermutResult2D(res = res_low_deg, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n no viral proteins with the degree of 1")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# frequency of a domain: no low background count domains AND no viral proteins with the degree of 1
PermutResult2D(res = res_low_deg_back, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: frequency of a domain among interacting partners of a viral protein \n no low background count domains \nAND no viral proteins with the degree of 1")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# Fisher test p-value
PermutResult2D(res = resFISHER, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: Fisher test p-value")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

# Fisher test p-value
PermutResult2D(res = resFISHER_both, N = 250) +
ggtitle("2D-bin plots of 250 top-scoring viral protein - human domain pairs, \n statistic: Fisher test p-value \n permute BOTH IDs_interactor_viral ~ IDs_interactor_human AND IDs_interactor_human ~ IDs_domain_human")
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is
## zero
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 250 rows containing non-finite values (stat_bin2d).

Map domains known to interact with linear motifs from ELM to the domains we found
interactiondomains = fread("http://elm.eu.org/interactiondomains.tsv")
interactiondomains[, pfam_id := `Interaction Domain Id`]
domains_known = interactiondomains[, unique(pfam_id)]
InterProScan_domains = readInterProGFF3("../viral_project/processed_data_files/all_human_viral_protein_domains.gff3.gz")
# get InterProID to member database ID mapping
InterPro2memberDB = getInterPro2memberDB(InterProScan_domains)
InterPro2memberDB = InterPro2memberDB[complete.cases(InterPro2memberDB)]
domains_known_mapped = unique(InterPro2memberDB[memberDBID %in% domains_known | InterProID %in% domains_known, InterProID])
domains_not_mapped = unique(domains_known[!(domains_known %in% InterPro2memberDB$memberDBID | domains_known %in% InterPro2memberDB$InterProID)])
I did Fisher test to evaluate if the domains that we find are enriched in domains known to interact with linear motifs (from ELM). I have picked some number of viral protein - human domain associations from the top (by p-value). Then I counted how many known domains we have found and did Fisher test. I decided to compare two statistic choices (frequency of a domain among interacting partners of a viral protein or Fisher test p-value) on how many of the known domains we tend to find. Finally, I was choosing different cutoffs (different number of top p-value pairs).
test_enrichment = function(N, res, domains_known_mapped, random = F, name = ""){
if(random) {
res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human, p.value, domain_type, domain_count, IDs_interactor_viral_degree)])
domains_found = res$data_pval[sample(1:nrow(res$data_with_pval), N), unique(IDs_domain_human)]
}
if(!random){
res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human, p.value, domain_type, domain_count, IDs_interactor_viral_degree)])
# res$data_pval[, pval_fdr := p.adjust(p.value, method = "fdr")]
# hist(res$data_pval[, pval_fdr], breaks = seq(0,1,0.01))
domains_found = res$data_pval[order(p.value, decreasing = F)[1:N], unique(IDs_domain_human)]
}
alldomains = res$data_with_pval[, unique(IDs_domain_human)]
known = factor(alldomains %in% domains_known_mapped, levels = c("TRUE", "FALSE"))
found = factor(alldomains %in% domains_found, levels = c("TRUE", "FALSE"))
table_res = table(known, found)
test = fisher.test(table(known, found), alternative = "greater", conf.int = T)
return(c(pval = test$p.value, odds_ratio = as.vector(test$estimate), count = table_res["TRUE", "TRUE"], name = name))
}
Ns = seq(25, 500, 25)
enrichment = sapply(Ns, test_enrichment, res, domains_known_mapped, name = "domain frequency among interactors of a viral protein")
colnames(enrichment) = Ns
enrichment_both = sapply(Ns, test_enrichment, res_both, domains_known_mapped, name = "domain frequency among interactors of a viral protein, permute both")
colnames(enrichment_both) = Ns
enrichment_low_back = sapply(Ns, test_enrichment, res_low_back, domains_known_mapped, name = "domain frequency: no low background")
colnames(enrichment_low_back) = Ns
enrichment_low_deg = sapply(Ns, test_enrichment, res_low_deg, domains_known_mapped, name = "domain frequency: no degree of 1")
colnames(enrichment_low_deg) = Ns
enrichment_low_deg_back = sapply(Ns, test_enrichment, res_low_deg_back, domains_known_mapped, name = "domain frequency: no degree of 1 AND no low background")
colnames(enrichment_low_deg_back) = Ns
enrichmentFISHER = sapply(Ns, test_enrichment, resFISHER, domains_known_mapped, name = "Fisher test pval: domain overrepresentation over the background")
colnames(enrichmentFISHER) = Ns
enrichmentFISHER_both = sapply(Ns, test_enrichment, resFISHER_both, domains_known_mapped, name = "Fisher test pval: domain overrepresentation over the background, permute both")
colnames(enrichmentFISHER_both) = Ns
random_domains = function(N = 100, seed = seed, Ns = seq(25, 500, 25)){
set.seed(seed)
quantiles = c(0.975, 0.75, 0.5, 0.25, 0.025)
quantile_names = c("97.5% quantile", "75% quantile", "median", "25% quantile", "2.5% quantile")
pval_temp = replicate(N, {
enrichmentRANDOM = sapply(Ns, test_enrichment, res, domains_known_mapped, random = T, name = "N random proteins")[1,]
names(enrichmentRANDOM) = Ns
as.numeric(enrichmentRANDOM)
})
pval = apply(pval_temp, 1, quantile, probs = quantiles)
rownames(pval) = quantile_names
colnames(pval) = Ns
odds_ratio_temp = replicate(N, {
enrichmentRANDOM = sapply(Ns, test_enrichment, res, domains_known_mapped, random = T, name = "N random proteins")[2,]
names(enrichmentRANDOM) = Ns
as.numeric(enrichmentRANDOM)
})
odds_ratio = apply(odds_ratio_temp, 1, quantile, probs = quantiles)
rownames(odds_ratio) = quantile_names
colnames(odds_ratio) = Ns
count_temp = replicate(N, {
enrichmentRANDOM = sapply(Ns, test_enrichment, res, domains_known_mapped, random = T, name = "N random proteins")[3,]
names(enrichmentRANDOM) = Ns
as.numeric(enrichmentRANDOM)
})
count = apply(count_temp, 1, quantile, probs = quantiles)
rownames(count) = quantile_names
colnames(count) = Ns
return(list(pval = pval, odds_ratio = odds_ratio, count = count))
}
enrichmentRANDOM = random_domains(100, 1)
As we include more proteins, the number of known domains we find increases and then levels off (probably because some of the known domains do not interact with viral proteins).
plotEnrichment = function(..., random_domains = NULL, domains_known_mapped, type = "count", plot_type = plot_type){
res = list(...)
typenum = match(type, c("pval", "odds_ratio", "count"))
ngroups = length(res)
if(type == "count") color = brewer.pal(ngroups + 1, "Dark2") else color = brewer.pal(ngroups, "Dark2")
if(is.na(typenum)) stop("'type' should be one of “count”, “odds_ratio”, “pval”")
leg_pos_y = max(sapply(res, function(x, typenum) max(as.numeric(x[typenum,])), typenum))
if(!is.null(random_domains)) leg_pos_y = max(leg_pos_y, random_domains[typenum][[1]])
if(type == "count") leg_pos_y = length(domains_known_mapped) - 1
leg_pos_x = max(sapply(res, function(x) max(as.numeric(colnames(x))))) * 0.16
if(type == "pval") {ylim = c(0, 1); ylab = "p-value"}
if(type == "count") {ylim = c(0,length(domains_known_mapped)+1); ylab = "known domain found"}
if(type == "odds_ratio") {ylim = c(0,leg_pos_y); ylab = "Fisher test odds ratio"}
plot(colnames(res[[1]]),rep(0,ncol(res[[1]])),
ylab = ylab, xlab = "top N viral protein - domain pairs selected",
type = plot_type, ylim = ylim, lwd = 0)
# plot random domains quantiles
if(!is.null(random_domains)){
random_legend = c("97.5% quantile", "75% quantile", "median", "25% quantile", "2.5% quantile")
random_cols = c("#DDDDDD", "#CCCCCC", "#AAAAAA", "#CCCCCC", "#DDDDDD")
random_line_width = c(2,4,8,4,2)
for (i in 1:5) {
lines(x = colnames(random_domains[typenum][[1]]), y = random_domains[typenum][[1]][random_legend[i],], col = random_cols[i], lwd = random_line_width[i], type = plot_type)
}
}
for (i in 1:ngroups) {
lines(x = colnames(res[[i]]), y = res[[i]][typenum,], col = color[i], type = plot_type, lwd = 3)
}
if(type == "count") abline(h = length(domains_known_mapped), col = color[ngroups + 1])
legend_names = c("statictic used in permutation test:")
for(i in 1:ngroups){
legend_names = c(legend_names, unique(res[[i]]["name",]))
}
if(type == "count") legend_names = c(legend_names, "domains known to interact with linear motifs")
line_width = rep(2, length(color) + 1)
if(!is.null(random_domains)){
legend_names = c(legend_names, paste0("N random protein-domain pairs, ", random_legend))
color = c(color, random_cols)
line_width = c(line_width, random_line_width)
}
legend(x = leg_pos_x, y = leg_pos_y, legend_names,
col = c("white", color), lty = 1, lwd = line_width, merge = TRUE)
}
plotEnrichment(enrichment, enrichment_both, enrichment_low_back, enrichment_low_deg, enrichment_low_deg_back,
enrichmentFISHER, enrichmentFISHER_both,
random_domains = enrichmentRANDOM,
domains_known_mapped = domains_known_mapped, type = "count", plot_type = "l")

As we include more proteins, the Fisher test odds ratio decreases (we add more stuff that is not known). Odds ratio measures how much more likely are we to find a domain using our procedure if it’s a known domain as compared to if it’s not a known domain.
plotEnrichment(enrichment, enrichment_both, enrichment_low_back, enrichment_low_deg, enrichment_low_deg_back,
enrichmentFISHER, enrichmentFISHER_both,
random_domains = enrichmentRANDOM,
domains_known_mapped = domains_known_mapped, type = "odds_ratio", plot_type = "l")

corresponding P-values from the Fisher test
plotEnrichment(enrichment, enrichment_both, enrichment_low_back, enrichment_low_deg, enrichment_low_deg_back,
enrichmentFISHER, enrichmentFISHER_both,
random_domains = enrichmentRANDOM,
domains_known_mapped = domains_known_mapped, type = "pval", plot_type = "l")

How many viral proteins are known per each of the domain instances in top 200 protein-domain pairs?
selectTopHits = function(res, N){
res$data_pval = unique(res$data_with_pval[,.(IDs_interactor_viral, IDs_domain_human, p.value, domain_type, domain_count, IDs_interactor_viral_degree)])
pairs200pval = res$data_pval[order(p.value, decreasing = F)[1:N], max(p.value)]
restop100 = res
restop100$data_with_pval = restop100$data_with_pval[p.value <= pairs200pval, ]
restop100$data_with_pval[, N_viral_per_human_w_domain := length(unique(IDs_interactor_viral)), by = .(IDs_interactor_human, IDs_domain_human)]
return(restop100)
}
restop100 = selectTopHits(res, N = 200)
plot(restop100, IDs_interactor_human ~ N_viral_per_human_w_domain)

plot(restop100)

49 human proteins with enriched domains have 5 or more viral interacting partners.
11 human proteins with enriched domains have 10 or more viral interacting partners.
what are those domains? are they known ELM-interacting domains? which proteins they are in? which viral taxons they interact with?
restop100$data_with_pval[N_viral_per_human_w_domain >= 10, unique(IDs_domain_human)]
## [1] "IPR000504" "IPR001452" "IPR001478" "IPR018316"
restop100$data_with_pval[N_viral_per_human_w_domain >= 10, unique(IDs_domain_human)] %in% domains_known_mapped
## [1] TRUE TRUE TRUE FALSE
restop100$data_with_pval[N_viral_per_human_w_domain >= 10 & IDs_domain_human == "IPR000504", unique(IDs_interactor_human)]
## [1] "O43390" "P09651" "P11940" "P22626" "P31943" "P38159" "Q99729"
restop100$data_with_pval[N_viral_per_human_w_domain >= 10 & IDs_domain_human == "IPR000504", unique(Taxid_interactor_viral)]
## [1] 10254 380964 88776 130763 10299 270485 211044 10377 10376 10298
## [11] 680716 28344 333284 128952 11097 10884 121791 10249 796210 381518
## [21] 387139 643960
DT::datatable(restop100$data_with_pval[N_viral_per_human_w_domain >= 10,])